Team Members

Javier Huamani NetID: Huamani2 UIN: 654292064

Sudha Natarajan NetID: Sudha2 UIN: 662428072

Contributions

Contribution from each team member:

  • Javier Huamani: Completed the entirety of Part1 of Coding assignment 4 including the derivations.
  • Sudha Natarajan: Completed the entirety of Part2 of Coding assignment 4.

Part 1

Prepared EM Functions

Estep <- function(data, G, para){
  # Your Code
  # Return the n-by-G probability matrix
  n = dim(data)[1]
  z = outer(1:n, 1:G,FUN = Vectorize(function(i, j) 
    -(1/2) * as.matrix(data[i,] - para$mean[,j]) %*% solve(para$Sigma) %*% t(as.matrix(data[i,] - para$mean[,j])) + log(para$prob[j]) )
  )
  
  # Subtracting max value of z to handle underflow problem
  z_max = apply(z, 1, max)
  z = exp(z - z_max) / rowSums(exp(z - z_max))
  
  z
}
Mstep <- function(data, G, para, post.prob){ 
  # Your Code
  # Return the updated parameters
  n = dim(data)[1]
  d = dim(data)[2]
  
  para$prob = colSums(post.prob) / n
  
  para$mean = outer(1:d, 1:G,FUN = Vectorize(function(i, j) 
    sum(as.matrix(post.prob[,j]*data[,i])) / sum(as.matrix(post.prob[,j]))
  ) )
  
  para$Sigma = matrix(0,d,d)
  
  for (g in 1:G){
    for (i in 1:n){
      para$Sigma = para$Sigma + (t(data[i,]) - para$mean[,g]) %*% t(t(data[i,]) - para$mean[,g]) * post.prob[i,g]
    }
  }
  para$Sigma = para$Sigma / n
  para
}
myEM <- function(data, itmax, G, para){
  # itmax: num of iterations
  # G:     num of components
  # para:  list of parameters (prob, mean, Sigma)
  for(t in 1:itmax){
    post.prob <- Estep(data, G, para)
    para <- Mstep(data, G, para, post.prob)
  }
  return(para)
}
options(digits=8)
options()$digits
## [1] 8

Test Functions

library(mclust)
## Package 'mclust' version 5.4.7
## Type 'citation("mclust")' for citing this R package in publications.
dim(faithful)
## [1] 272   2
head(faithful)
n = nrow(faithful)

Two Clusters

K <- 2
set.seed(2064)  
gID <- sample(1:K, n, replace = TRUE)
Z <- matrix(0, n, K)
for(k in 1:K)
  Z[gID == k, k] <- 1 
ini0 <- mstep(modelName="EEE", faithful , Z)$parameters
para0 <- list(prob = ini0$pro, 
              mean = ini0$mean, 
              Sigma = ini0$variance$Sigma)
para0
## $prob
## [1] 0.5 0.5
## 
## $mean
##                 [,1]       [,2]
## eruptions  3.5146544  3.4609118
## waiting   71.2573529 70.5367647
## 
## $Sigma
##            eruptions    waiting
## eruptions  1.2972168  13.916737
## waiting   13.9167373 184.014003
myEM(data=faithful, itmax=20, G=K, para=para0)
## $prob
## [1] 0.50002545 0.49997455
## 
## $mean
##            [,1]       [,2]
## [1,]  3.5148192  3.4607442
## [2,] 71.2592307 70.5348501
## 
## $Sigma
##            eruptions    waiting
## eruptions  1.2972079  13.916626
## waiting   13.9166261 184.012633
Rout <- em(modelName = "EEE", data = faithful,
           control = emControl(eps=0, tol=0, itmax = 20), 
           parameters = ini0)$parameters
list(Rout$pro, Rout$mean, Rout$variance$Sigma)
## [[1]]
## [1] 0.50002545 0.49997455
## 
## [[2]]
##                 [,1]       [,2]
## eruptions  3.5148192  3.4607442
## waiting   71.2592307 70.5348501
## 
## [[3]]
##            eruptions    waiting
## eruptions  1.2972079  13.916626
## waiting   13.9166261 184.012633

The results from our implementation of the EM algorithm and the built in em function matched exactly to the required precision.

Three Clusters

K <- 3
gID <- sample(1:K, n, replace = TRUE)
Z <- matrix(0, n, K)
for(k in 1:K)
  Z[gID == k, k] <- 1 
ini0 <- mstep(modelName="EEE", faithful , Z)$parameters
para0 <- list(prob = ini0$pro, 
              mean = ini0$mean, 
              Sigma = ini0$variance$Sigma)
para0
## $prob
## [1] 0.30147059 0.34558824 0.35294118
## 
## $mean
##                 [,1]       [,2]       [,3]
## eruptions  3.4613415  3.5320851  3.4669896
## waiting   70.5000000 70.3404255 71.7812500
## 
## $Sigma
##            eruptions    waiting
## eruptions  1.2968972  13.938265
## waiting   13.9382649 183.713282
myEM(data=faithful, itmax=20, G=K, para=para0)
## $prob
## [1] 0.30166706 0.34746712 0.35086582
## 
## $mean
##            [,1]       [,2]       [,3]
## [1,]  3.4463918  3.5422278  3.4694531
## [2,] 69.9897649 70.4347856 72.1349264
## 
## $Sigma
##            eruptions    waiting
## eruptions  1.2962742  13.931796
## waiting   13.9317964 183.283598
Rout <- em(modelName = "EEE", data = faithful,
           control = emControl(eps=0, tol=0, itmax = 20), 
           parameters = ini0)$parameters
list(Rout$pro, Rout$mean, Rout$variance$Sigma)
## [[1]]
## [1] 0.30166706 0.34746712 0.35086582
## 
## [[2]]
##                 [,1]       [,2]       [,3]
## eruptions  3.4463918  3.5422278  3.4694531
## waiting   69.9897649 70.4347856 72.1349264
## 
## [[3]]
##            eruptions    waiting
## eruptions  1.2962742  13.931796
## waiting   13.9317964 183.283598

The results from our implementation of the EM algorithm and the built in em function matched exactly to the required precision.

Derivations

1 and 2

1 and 2

3 and 4

3 and 4

5

5

5

5

5

5

Part II

The Baum-Welch algorihtm

The Baum-Welch Algorihtm is the EM algorithm for HMM. You should prepare a function BW.onestep to perform the E-step and M-step, and then iteratively call that function in myBW.

Note that We do not update w.

myBW = function(x, para, n.iter = 100){
  # Input:
  # x: T-by-1 observation sequence
  # para: initial parameter value
  # Output updated para value (A and B; we do not update w)
  
  for(i in 1:n.iter){
    para = BW.onestep(x, para)
  }
  return(para)
}

Your function BW.onestep, in which we operate the E-step and M-step for one iteration, should look as follows.

BW.onestep = function(x, para){
  
  T = length(x)
  mz = para$mz
  mx = para$mx
  A = para$A
  B = para$B
  w = para$w
  alp = forward.prob(x, para)
  beta = backward.prob(x, para)
  
myGamma = array(0, dim=c(mz, mz, T-1))
  ## YOUR CODE: 
  ## Compute gamma_t(i,j) P(Z[t] = i, Z[t+1]=j), 
  ## for t=1:T-1, i=1:mz, j=1:mz, 
  ## which are stored an array, myGamma
  for (t in 1:(T-1)){
    for(i in 1:mz)
      for(j in 1:mz)
        myGamma[i,j,t]= alp[t,i]*A[i,j]* B[j, x[t+1]] *beta[t+1,j]
  }


  # M-step for parameter A
  A = rowSums(myGamma, dims = 2)
  A = A/rowSums(A)
  
  
  # M-step for parameter B
  tmp = apply(myGamma, c(1, 3), sum)  # mz-by-(T-1)
  tmp = cbind(tmp, colSums(myGamma[, , T-1]))
  for(l in 1:mx){
    B[, l] = rowSums(tmp[, which(x==l)])
  }
  B = B/rowSums(B)
  
  para$A = A
  para$B = B
  return(para)

}

You can compute the forward and backward probabilities using the following functions.

forward.prob = function(x, para){
  # Output the forward probability matrix alp 
  # alp: T by mz, (t, i) entry = P(x_{1:t}, Z_t = i)
  T = length(x)
  mz = para$mz
  A = para$A
  B = para$B
  w = para$w
  alp = matrix(0, T, mz)
  
  # fill in the first row of alp
  alp[1, ] = w * B[, x[1]]
  # Recursively compute the remaining rows of alp
  for(t in 2:T){
    tmp = alp[t-1, ] %*% A
    alp[t, ] = tmp * B[, x[t]]
    }
  return(alp)
}


backward.prob = function(x, para){
  # Output the backward probability matrix beta
  # beta: T by mz, (t, i) entry = P(x_{1:t}, Z_t = i)
  T = length(x)
  mz = para$mz
  A = para$A
  B = para$B
  w = para$w
  beta = matrix(1, T, mz)
  
  # The last row of beta is all 1.
  # Recursively compute the previous rows of beta
  for(t in (T-1):1){
    tmp = as.matrix(beta[t+1, ] * B[, x[t+1]])  # make tmp a column vector
    beta[t, ] = t(A %*% tmp)
    }
  return(beta)
}

The Viterbi algorihtm

The Viterbi algorihtm returns the most probable latent sequence given the data and the MLE of parameters.

myViterbi = function(x, para){
  # Output: most likely sequence of Z (T-by-1)
  T = length(x)
  mz = para$mz
  A = para$A
  B = para$B
  w = para$w
  log.A = log(A)
  log.w = log(w)
  log.B = log(B)
  
  # Compute delta (in log-scale)
  delta = matrix(0, T, mz) 
  # fill in the first row of delta
  delta[1, ] = log.w + log.B[, x[1]]
  
  ## YOUR CODE: 
  for (t in 2:T){
    for (i in 1:mz){
      delta[t,i] = max(delta[t-1,]+log.A[,i])+log.B[i, x[t]]
    }
  }
  
  ## Recursively compute the remaining rows of delta
  
  # Compute most prob sequence Z
  Z = rep(0, T)
  # start with the last entry of Z
  Z[T] = which.max(delta[T, ])
  
  ## YOUR CODE: 
  Z[T] = which.max(delta[T,])
  for (t in (T-1):1){
    Z[t] = which.max(delta[t,]+log.A[,Z[t+1]])
  }
  ## Recursively compute the remaining entries of Z
  
  return(Z)
}

Test your function

Your result

Try your code on the data provided on Campuswire. You can (i) use the initial values specified below or (ii) use your own initial values. For the latter, remember to set the seed as the last four digits of your UIN.

data = scan("coding4_part2_data.txt")
mz = 2
mx = 3
ini.w = rep(1, mz); ini.w = ini.w / sum(ini.w)
ini.A = matrix(1, 2, 2); ini.A = ini.A / rowSums(ini.A)
ini.B = matrix(1:6, 2, 3); ini.B = ini.B / rowSums(ini.B)
ini.para = list(mz = 2, mx = 3, w = ini.w,A = ini.A, B = ini.B)
myout = myBW(data, ini.para, n.iter = 100)
myout.Z = myViterbi(data, myout)
myout.Z[myout.Z==1] = 'A'
myout.Z[myout.Z==2] = 'B'

Result from HMM

Call R package HMM

library(HMM)
hmm0 =initHMM(c("A", "B"), c(1, 2, 3),
              startProbs = ini.w,
              transProbs = ini.A, 
              emissionProbs = ini.B)
Rout = baumWelch(hmm0, data, maxIterations=100, delta=1E-9, pseudoCount=0)
Rout.Z = viterbi(Rout$hmm, data)

Compare two results

options(digits=8)
options()$digits
## [1] 8

Compare estimates for transition prob matrix A

myout$A
##            [,1]       [,2]
## [1,] 0.49793938 0.50206062
## [2,] 0.44883431 0.55116569
Rout$hmm$transProbs
##     to
## from          A          B
##    A 0.49793938 0.50206062
##    B 0.44883431 0.55116569

Compare estimates for emission prob matrix B

myout$B
##            [,1]       [,2]       [,3]
## [1,] 0.22159897 0.20266127 0.57573976
## [2,] 0.34175148 0.17866665 0.47958186
Rout$hmm$emissionProbs
##       symbols
## states          1          2          3
##      A 0.22159897 0.20266127 0.57573976
##      B 0.34175148 0.17866665 0.47958186

Compare the most probable Z sequence.

cbind(Rout.Z, myout.Z)[c(1:10, 180:200), ]
##       Rout.Z myout.Z
##  [1,] "A"    "A"    
##  [2,] "A"    "A"    
##  [3,] "A"    "A"    
##  [4,] "A"    "A"    
##  [5,] "A"    "A"    
##  [6,] "A"    "A"    
##  [7,] "A"    "A"    
##  [8,] "B"    "B"    
##  [9,] "A"    "A"    
## [10,] "A"    "A"    
## [11,] "B"    "B"    
## [12,] "A"    "A"    
## [13,] "A"    "A"    
## [14,] "A"    "A"    
## [15,] "A"    "A"    
## [16,] "A"    "A"    
## [17,] "A"    "A"    
## [18,] "A"    "A"    
## [19,] "A"    "A"    
## [20,] "B"    "B"    
## [21,] "B"    "B"    
## [22,] "B"    "B"    
## [23,] "B"    "B"    
## [24,] "B"    "B"    
## [25,] "A"    "A"    
## [26,] "A"    "A"    
## [27,] "A"    "A"    
## [28,] "A"    "A"    
## [29,] "A"    "A"    
## [30,] "A"    "A"    
## [31,] "A"    "A"
sum(Rout.Z != myout.Z)
## [1] 0

Results match exactly.